.

# install packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(gganimate)
library(magick)
## Linking to ImageMagick 6.9.9.39
## Enabled features: cairo, fontconfig, freetype, lcms, pango, rsvg, webp
## Disabled features: fftw, ghostscript, x11
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(corrplot)
## corrplot 0.84 loaded
library(beepr)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(praise)
library(transformr)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## 
## Attaching package: 'sf'
## The following object is masked from 'package:transformr':
## 
##     st_normalize
library(here)
## here() starts at /Users/allisonbailey/Desktop/Fall 2019 Courses/ESM 206-Data/Lab Assignments/Lab 10/esm206-f2019-lab-10

Objectives

Multiple Linear Regression

homes <- read_csv("slo_homes.csv")
## Parsed with column specification:
## cols(
##   City = col_character(),
##   Price = col_double(),
##   Bedrooms = col_double(),
##   Bathrooms = col_double(),
##   SqFt = col_double(),
##   PricePerSqFt = col_double(),
##   Status = col_character()
## )
homes_clean <- homes %>%
  clean_names() %>%
  filter(city %in% c("San Luis Obispo", "Atascadero", "Arroyo Grande"))

Look at correlations between numeric variables (checking to see if we think that collinearity might be a concern)

homes_cor <- cor(homes_clean[2:6])

homes_cor
##                     price    bedrooms bathrooms     sq_ft price_per_sq_ft
## price           1.0000000  0.31152578 0.4991453 0.6618575      0.73664029
## bedrooms        0.3115258  1.00000000 0.7519186 0.6960233     -0.09994633
## bathrooms       0.4991453  0.75191865 1.0000000 0.8046112      0.07624770
## sq_ft           0.6618575  0.69602326 0.8046112 1.0000000      0.12549708
## price_per_sq_ft 0.7366403 -0.09994633 0.0762477 0.1254971      1.00000000

Make a correlogram (plot of correlations)

corrplot(homes_cor)

beep(20)

Let’s start with a complete model:

home_lm <- lm(price ~ city + bedrooms + bathrooms + sq_ft + status, data = homes_clean)

home_lm
## 
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status, 
##     data = homes_clean)
## 
## Coefficients:
##         (Intercept)       cityAtascadero  citySan Luis Obispo  
##            184130.9            -167396.4              31018.1  
##            bedrooms            bathrooms                sq_ft  
##           -161645.5              48692.0                389.1  
##       statusRegular     statusShort Sale  
##            303964.6             -19828.6
# P = 184130.9 - 167396(Atascadero) + 31018(SLO) - 161645(bedrooms)

If everything else about the home is the same, then we would expect a home in Atascadero to sell for $167,396 dollars LESS than a home in Arroyo Grande, on average.

home_lm2 <- lm(price ~ city + sq_ft + status, data = homes_clean)
home_lm2
## 
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes_clean)
## 
## Coefficients:
##         (Intercept)       cityAtascadero  citySan Luis Obispo  
##             -105821              -182587                70279  
##               sq_ft        statusRegular     statusShort Sale  
##                 325               315441                31284

AIC values:

AIC(home_lm)
## [1] 3576.834
AIC(home_lm2)
## [1] 3584.3
summary(home_lm)
## 
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status, 
##     data = homes_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -843938 -115963  -12298  109718 3445077 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          184130.93  143368.84   1.284  0.20157    
## cityAtascadero      -167396.39   78701.95  -2.127  0.03552 *  
## citySan Luis Obispo   31018.14  114895.10   0.270  0.78766    
## bedrooms            -161645.51   49414.32  -3.271  0.00141 ** 
## bathrooms             48692.04   52408.63   0.929  0.35476    
## sq_ft                   389.12      53.17   7.318 3.43e-11 ***
## statusRegular        303964.64  105942.81   2.869  0.00488 ** 
## statusShort Sale     -19828.56   86335.35  -0.230  0.81875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 380600 on 117 degrees of freedom
## Multiple R-squared:  0.5682, Adjusted R-squared:  0.5424 
## F-statistic:    22 on 7 and 117 DF,  p-value: < 2.2e-16
summary(home_lm2)
## 
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -653118 -120914   -8844  101402 3644738 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -105820.71  118697.40  -0.892   0.3745    
## cityAtascadero      -182586.90   80683.44  -2.263   0.0254 *  
## citySan Luis Obispo   70278.97  115846.31   0.607   0.5452    
## sq_ft                   325.03      31.54  10.306   <2e-16 ***
## statusRegular        315441.26  109749.65   2.874   0.0048 ** 
## statusShort Sale      31284.44   87356.11   0.358   0.7209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 395100 on 119 degrees of freedom
## Multiple R-squared:  0.5268, Adjusted R-squared:  0.5069 
## F-statistic: 26.49 on 5 and 119 DF,  p-value: < 2.2e-16

Now I’m going to create a nice regression table with ‘stargazer’ :

stargazer(home_lm, home_lm2, type = "html")
Dependent variable:
price
(1) (2)
cityAtascadero -167,396.400** -182,586.900**
(78,701.950) (80,683.440)
citySan Luis Obispo 31,018.140 70,278.970
(114,895.100) (115,846.300)
bedrooms -161,645.500***
(49,414.320)
bathrooms 48,692.040
(52,408.630)
sq_ft 389.120*** 325.028***
(53.171) (31.538)
statusRegular 303,964.600*** 315,441.300***
(105,942.800) (109,749.700)
statusShort Sale -19,828.560 31,284.440
(86,335.350) (87,356.110)
Constant 184,130.900 -105,820.700
(143,368.800) (118,697.400)
Observations 125 125
R2 0.568 0.527
Adjusted R2 0.542 0.507
Residual Std. Error 380,586.300 (df = 117) 395,084.400 (df = 119)
F Statistic 21.997*** (df = 7; 117) 26.491*** (df = 5; 119)
Note: p<0.1; p<0.05; p<0.01

Now let’s check out the diagnostic plots to see if our assumptions (normality of residulas and homoscedasticity) are satisfied or if we’re really concerned.

plot(home_lm2)

Make some home price predictions with new data

We’re going to be using that simplified model (home_lm2)

new_df <- data.frame(city = rep(c("San Luis Obispo", "Arroyo Grande", "Atascadero"), each = 10), 
                     sq_ft = rep(seq(0, 5000, length = 10)),
                     status = "Regular")
new_df
##               city     sq_ft  status
## 1  San Luis Obispo    0.0000 Regular
## 2  San Luis Obispo  555.5556 Regular
## 3  San Luis Obispo 1111.1111 Regular
## 4  San Luis Obispo 1666.6667 Regular
## 5  San Luis Obispo 2222.2222 Regular
## 6  San Luis Obispo 2777.7778 Regular
## 7  San Luis Obispo 3333.3333 Regular
## 8  San Luis Obispo 3888.8889 Regular
## 9  San Luis Obispo 4444.4444 Regular
## 10 San Luis Obispo 5000.0000 Regular
## 11   Arroyo Grande    0.0000 Regular
## 12   Arroyo Grande  555.5556 Regular
## 13   Arroyo Grande 1111.1111 Regular
## 14   Arroyo Grande 1666.6667 Regular
## 15   Arroyo Grande 2222.2222 Regular
## 16   Arroyo Grande 2777.7778 Regular
## 17   Arroyo Grande 3333.3333 Regular
## 18   Arroyo Grande 3888.8889 Regular
## 19   Arroyo Grande 4444.4444 Regular
## 20   Arroyo Grande 5000.0000 Regular
## 21      Atascadero    0.0000 Regular
## 22      Atascadero  555.5556 Regular
## 23      Atascadero 1111.1111 Regular
## 24      Atascadero 1666.6667 Regular
## 25      Atascadero 2222.2222 Regular
## 26      Atascadero 2777.7778 Regular
## 27      Atascadero 3333.3333 Regular
## 28      Atascadero 3888.8889 Regular
## 29      Atascadero 4444.4444 Regular
## 30      Atascadero 5000.0000 Regular

Use the ‘predict()’ to make new predicitons using home_lm2:

predict_df <- predict(home_lm2, newdata = new_df)
predict_df
##          1          2          3          4          5          6 
##  279899.52  460470.61  641041.69  821612.78 1002183.86 1182754.94 
##          7          8          9         10         11         12 
## 1363326.03 1543897.11 1724468.20 1905039.28  209620.55  390191.63 
##         13         14         15         16         17         18 
##  570762.72  751333.80  931904.89 1112475.97 1293047.06 1473618.14 
##         19         20         21         22         23         24 
## 1654189.23 1834760.31   27033.65  207604.73  388175.82  568746.90 
##         25         26         27         28         29         30 
##  749317.99  929889.07 1110460.16 1291031.24 1471602.33 1652173.41
# Bind that together with the new_df:
full_df <- data.frame(new_df, predict_df)

full_df
##               city     sq_ft  status predict_df
## 1  San Luis Obispo    0.0000 Regular  279899.52
## 2  San Luis Obispo  555.5556 Regular  460470.61
## 3  San Luis Obispo 1111.1111 Regular  641041.69
## 4  San Luis Obispo 1666.6667 Regular  821612.78
## 5  San Luis Obispo 2222.2222 Regular 1002183.86
## 6  San Luis Obispo 2777.7778 Regular 1182754.94
## 7  San Luis Obispo 3333.3333 Regular 1363326.03
## 8  San Luis Obispo 3888.8889 Regular 1543897.11
## 9  San Luis Obispo 4444.4444 Regular 1724468.20
## 10 San Luis Obispo 5000.0000 Regular 1905039.28
## 11   Arroyo Grande    0.0000 Regular  209620.55
## 12   Arroyo Grande  555.5556 Regular  390191.63
## 13   Arroyo Grande 1111.1111 Regular  570762.72
## 14   Arroyo Grande 1666.6667 Regular  751333.80
## 15   Arroyo Grande 2222.2222 Regular  931904.89
## 16   Arroyo Grande 2777.7778 Regular 1112475.97
## 17   Arroyo Grande 3333.3333 Regular 1293047.06
## 18   Arroyo Grande 3888.8889 Regular 1473618.14
## 19   Arroyo Grande 4444.4444 Regular 1654189.23
## 20   Arroyo Grande 5000.0000 Regular 1834760.31
## 21      Atascadero    0.0000 Regular   27033.65
## 22      Atascadero  555.5556 Regular  207604.73
## 23      Atascadero 1111.1111 Regular  388175.82
## 24      Atascadero 1666.6667 Regular  568746.90
## 25      Atascadero 2222.2222 Regular  749317.99
## 26      Atascadero 2777.7778 Regular  929889.07
## 27      Atascadero 3333.3333 Regular 1110460.16
## 28      Atascadero 3888.8889 Regular 1291031.24
## 29      Atascadero 4444.4444 Regular 1471602.33
## 30      Atascadero 5000.0000 Regular 1652173.41

Make a graph with the actuall data, and what our model actually predicts:

ggplot() +
  geom_point(data = homes_clean,
             aes(x = sq_ft, y = price, color = city, pch = city)) +
  geom_line(data = full_df,
            aes(x = sq_ft, y = predict_df, color = city)) +
  theme_light()

ggplot(data = homes_clean, aes(x= sq_ft, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~city)

Intro to the sf packages

‘sf’ = simple features

Sticky geometries! Which means we can just work with attributes like a normal data frame for wrangling, viz, etc.

Get CA dams data:

dams <- read_csv("ca_dams.csv") %>%
  clean_names() %>%
  drop_na(latitude) %>%
  drop_na(longitude) %>%
  drop_na(year_completed)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   RECORDID = col_double(),
##   STATEID = col_logical(),
##   LONGITUDE = col_double(),
##   LATITUDE = col_double(),
##   DISTANCE = col_double(),
##   YEAR_COMPLETED = col_double(),
##   DAM_LENGTH = col_double(),
##   DAM_HEIGHT = col_double(),
##   STRUCTURAL_HEIGHT = col_double(),
##   HYDRAULIC_HEIGHT = col_double(),
##   NID_HEIGHT = col_double(),
##   MAX_DISCHARGE = col_double(),
##   MAX_STORAGE = col_double(),
##   NORMAL_STORAGE = col_double(),
##   NID_STORAGE = col_double(),
##   SURFACE_AREA = col_double(),
##   DRAINAGE_AREA = col_double(),
##   INSPECTION_FREQUENCY = col_double(),
##   SPILLWAY_WIDTH = col_double(),
##   VOLUME = col_double()
##   # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 109 parsing failures.
##  row       col           expected actual          file
## 1337 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1338 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1339 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1340 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1341 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## .... ......... .................. ...... .............
## See problems(...) for more details.

Convert lat/lon numbers to simple features data (sf)

dams_sf <- st_as_sf(dams, coords = c("longitude", "latitude"))
st_crs(dams_sf) <- 4326

plot(dams_sf)
## Warning: plotting the first 9 out of 67 attributes; use max.plot = 67 to
## plot all
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

Now, lets read n the shapefile data for the state of CA (outer boundary TIGER shapefile data)

ca_border <- read_sf(here::here("ca_state_border"), layer = "CA_State_TIGER2016")

plot(ca_border)
## Warning: plotting the first 9 out of 14 attributes; use max.plot = 14 to
## plot all

Now, plot dams n CA over the top of the CA outline:

ggplot() +
  geom_sf(data = ca_border, fill = "red") +
  geom_sf(data = dams_sf, aes(size = dam_height, color = county), alpha = 0.5, show.legend = FALSE) +
  theme_minimal()

Making an animated plot:

ggplot() +
  geom_sf(data = ca_border) +
  geom_sf(data = dams_sf, size = 1.5, color = "gray50") +
  theme_void() +
  labs(title = 'Year: {round(frame_time, 0)}') +
  transition_time(year_completed) +
  shadow_mark(alpha = 1)